import pandas as pd
import numpy as np
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
# Vocabulary for the synthetic event log: the possible event names and the
# client platforms an event can come from.
events = [
    'Home',
    'Cart',
    'Product',
    'Cancel',
    'Purchase',
    'Category',
    'Brand',
    'History',
]
# NOTE(review): 'Andriod' looks like a misspelling of 'Android'; it is a
# runtime value, so it is kept exactly as written.
platforms = [
    'Andriod',
    'iOS',
    'PC',
]
def random_dates(start, end, n):
    """Draw ``n`` random timestamps, at one-second resolution, from ``[start, end)``.

    Args:
        start: Lower bound (inclusive), a ``pd.Timestamp``.
        end: Upper bound (exclusive), a ``pd.Timestamp``.
        n: Number of timestamps to draw.

    Returns:
        The drawn timestamps as returned by ``pd.to_datetime`` (a
        ``DatetimeIndex``); they are not sorted.

    Raises:
        ValueError: Raised by NumPy when ``end`` is not at least one whole
            second after ``start`` (empty sampling range).
    """
    # Timestamp.value is nanoseconds since the epoch; work in whole seconds.
    start_u = start.value // 10**9
    end_u = end.value // 10**9
    # Pin the dtype: randint's default integer type is platform-dependent and
    # is 32-bit on some platforms, which rejects epoch seconds past 2038.
    seconds = np.random.randint(start_u, end_u, n, dtype=np.int64)
    return pd.to_datetime(seconds, unit='s')
# Time window covered by the synthetic event log.
start = pd.to_datetime('2022-01-01')
end = pd.to_datetime('2022-08-01')

rng = np.random.default_rng()

# 10,000 events, each attributed to a user id drawn from [0, 5000).
df = pd.DataFrame({'user_id': rng.integers(0, 5000, size=10000)})

# Give every event a random timestamp, event name and platform.
df['time'] = random_dates(start, end, n=len(df))
df['event_name'] = np.random.choice(events, size=len(df))
df['paltform'] = np.random.choice(platforms, size=len(df))

# Preview the first rows.
df.head()
| | user_id | time | event_name | paltform |
|---|---|---|---|---|
| 0 | 2613 | 2022-07-11 10:06:07 | Brand | PC |
| 1 | 4788 | 2022-07-17 22:14:09 | Cancel | PC |
| 2 | 2729 | 2022-05-25 06:36:18 | Home | Andriod |
| 3 | 3160 | 2022-05-15 16:10:41 | Home | Andriod |
| 4 | 1073 | 2022-01-30 04:41:28 | Purchase | PC |
def filter_starting_step(x, starting_step, n_steps):
    """Return up to ``n_steps`` items of ``x``, beginning at ``starting_step``.

    The slice starts at the first occurrence of ``starting_step`` in ``x``
    and is used to build the event-sequence journey of a single user.
    ``x.index`` raises ``ValueError`` when ``starting_step`` is not in ``x``.
    """
    first = x.index(starting_step)
    last = first + n_steps
    return x[first:last]
import random
# Two fixed palettes with one entry per event: a saturated colour for the
# nodes and a lighter one for the links.
# If you don't have a palette in mind, random colours work as well:
#   number_of_colors = len(events)
#   color = ["#" + ''.join([random.choice('0123456789ABCDEF') for j in range(6)])
#            for i in range(number_of_colors)]
color_n = [
    '#55CBCD', '#CBAACB', '#FF968A', '#4DD091',
    '#FF5768', '#0065A2', '#57838D', '#FFC500',
]
color_l = [
    '#D4F0F0', '#ECD5E3', '#FFDBCC', '#E0F8F5',
    '#FFEFFF', '#9EDDEF', '#D7D2EA', '#FFF7C2',
]

# Map each event name to the colour at the same position in the palette.
color_dictn = {name: color_n[pos] for pos, name in enumerate(events)}
color_dictl = {name: color_l[pos] for pos, name in enumerate(events)}
def user_journey(df, starting_step, n_steps=5):
    """Summarise user journeys as the nodes and links of a Sankey diagram.

    For every user with at least one ``starting_step`` event, the journey is
    that user's events in chronological order, beginning at the first
    ``starting_step`` and truncated to ``n_steps`` events. Each pair of
    consecutive events in a journey is a transition whose endpoints are
    labelled with their step number (e.g. ``'1: Home'`` -> ``'2: Cart'``);
    identical transitions are counted across users.

    Args:
        df: DataFrame with ``user_id``, ``time`` and ``event_name`` columns.
        starting_step: Event name at which every journey starts.
        n_steps: Maximum number of events kept per journey.

    Returns:
        A tuple ``(label_list, colors_node, colors_link, source_target_df)``:

        * ``label_list``: sorted list of the node labels that take part in
          at least one transition.
        * ``colors_node``: one colour per entry of ``label_list``, looked up
          in the module-level ``color_dictn``.
        * ``colors_link``: one colour per row of ``source_target_df``, looked
          up in the module-level ``color_dictl`` by the link's target event.
        * ``source_target_df``: DataFrame with columns ``source``,
          ``target``, ``count``, ``source_id`` and ``target_id``, where the
          ids are positions in ``label_list``.

        All four are empty when no user performed ``starting_step`` or when
        no journey has a second step.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError('n_steps must be at least 1')

    # Used for events missing from the palettes, so that the colour lists
    # always stay aligned with the labels and links.
    fallback_node_color = '#888888'
    fallback_link_color = '#DDDDDD'

    def event_of(label):
        # '3: Cart' -> 'Cart'
        return label.split(': ', 1)[1]

    def empty_result():
        no_links = pd.DataFrame({
            'source': pd.Series(dtype=object),
            'target': pd.Series(dtype=object),
            'count': pd.Series(dtype='int64'),
            'source_id': pd.Series(dtype='int64'),
            'target_id': pd.Series(dtype='int64'),
        })
        return [], [], [], no_links

    # Journeys are only meaningful in chronological order, so every later
    # step works on the sorted frame.
    ordered = df.sort_values(['user_id', 'time'])

    # find the users that have performed the starting_step
    is_start = ordered['event_name'] == starting_step
    valid_ids = ordered.loc[is_start, 'user_id'].unique()
    if len(valid_ids) == 0:
        return empty_result()

    # one chronological list of event names per user
    sequences = (
        ordered[ordered['user_id'].isin(valid_ids)]
        .groupby('user_id')['event_name']
        .agg(list)
    )

    # cut every sequence down to the journey starting at starting_step
    journeys = []
    for sequence in sequences:
        first = sequence.index(starting_step)
        journeys.append(sequence[first:first + n_steps])

    # One column per step; journeys shorter than n_steps are padded with
    # missing values, and reindex guarantees exactly n_steps columns.
    steps = pd.DataFrame(journeys, index=sequences.index)
    steps = steps.reindex(columns=range(n_steps))

    # Turn each pair of neighbouring steps into source/target rows. Rows with
    # a missing side mean the journey has already ended and are dropped.
    pairs = []
    for step in range(n_steps - 1):
        pair = pd.DataFrame({
            'source': steps[step],
            'target': steps[step + 1],
        }).dropna()
        if pair.empty:
            continue
        # add the step number as prefix to each step
        pairs.append(pd.DataFrame({
            'source': '{}: '.format(step + 1) + pair['source'].astype(str),
            'target': '{}: '.format(step + 2) + pair['target'].astype(str),
        }))
    if not pairs:
        return empty_result()

    # count how many users made each transition
    source_target_df = (
        pd.concat(pairs, ignore_index=True)
        .groupby(['source', 'target'])
        .size()
        .reset_index(name='count')
    )

    # create the node labels list (sorted so the output is deterministic)
    label_list = sorted(
        set(source_target_df['source']) | set(source_target_df['target'])
    )
    label_index = {label: pos for pos, label in enumerate(label_list)}

    # exactly one colour per node and per link, matched on the event name
    colors_node = [
        color_dictn.get(event_of(label), fallback_node_color)
        for label in label_list
    ]
    colors_link = [
        color_dictl.get(event_of(label), fallback_link_color)
        for label in source_target_df['target']
    ]

    # add index for source-target pair
    source_target_df['source_id'] = source_target_df['source'].map(label_index)
    source_target_df['target_id'] = source_target_df['target'].map(label_index)
    return label_list, colors_node, colors_link, source_target_df
def plot_user_flow(df, starting_step, n_steps=5, title='Sankey Diagram'):
    """Assemble the figure dict of a Sankey diagram of user journeys.

    The nodes, links and their colours come from ``user_journey`` called with
    the same ``df``, ``starting_step`` and ``n_steps``.

    Returns:
        A dict with a ``data`` list holding a single ``sankey`` trace and a
        ``layout`` dict; it is the object this file passes to ``iplot``.
    """
    labels, node_colors, link_colors, links = user_journey(df, starting_step, n_steps)

    # node appearance and labels
    node = dict(
        pad=20,
        thickness=20,
        color=node_colors,
        line=dict(color="black", width=0.5),
        label=labels,
    )

    # links between nodes, referenced by node position
    link = dict(
        source=links['source_id'].tolist(),
        target=links['target_id'].tolist(),
        value=links['count'].astype(int).tolist(),
        color=link_colors,
        hoverlabel=dict(bgcolor='#C2C4C7'),
    )

    sankey = dict(type='sankey', node=node, link=link)

    # From 5 steps up, give each step 250px so that steps are evenly spaced
    # out; below that no explicit width is set.
    width = n_steps * 250 if n_steps >= 5 else None

    layout = dict(
        height=700,
        width=width,
        margin=dict(t=30, l=0, r=0, b=30),
        title=title,
        font=dict(size=16),
    )

    return dict(data=[sankey], layout=layout)
# Draw the journeys that begin at a user's first 'Home' event, up to four
# steps deep, and render the figure inline in the notebook.
fig = plot_user_flow(
    df,
    starting_step='Home',
    n_steps=4,
    title='Customer Journey Sankey Diagram',
)
iplot(fig)